A new post by hswerdfe
Data quality sucks, and we have to match records together.
library(knitr)
library(tidyverse)
library(janitor)
library(lubridate)
library(fuzzyjoin) # for common mispellings
#library(maps) # for city Names, but I don't want to override purrr:map
library(babynames) # for first names
library(lexicon) # for Last names
library(reclin) # for de-duplication
library(phonics) # for soundex
library(plotROC) # for ROC curves
library(AUC)# for AUC calculations curves
library(magrittr) #extract2
library(igraph) # Neigbourhood determination
library(lemon)
library(snakecase)
theme_set(theme_minimal())
knit_print.data.frame <- lemon_print
set.seed(as.integer(as.Date("2021-04-09")))
g_num_entities = 72
g_num_dup_max = 12
g_prob_error = 0.05
g_prob_miss = 0.05
Generate a dataset with:
#'
#' get a single character out of a string
#'
#' s2c(i = 1, "data Doo Doo")
s2c <- function(i, str, ...){
  # Extract the single character at position i of str.
  substr(str, i, i, ...)
}
#'
#' Split a string into a vector of its single characters
#'
#' Uses seq_len() so that an empty string yields an empty result rather
#' than the 1:0 == c(1, 0) footgun of 1:nchar(str).
#'
#' example:
#' map_s2c("data Doo Doo")
map_s2c <- function(str, ...){
  purrr::map(seq_len(nchar(str)), s2c, str = str, ...) %>% unlist()
}
#'
#' Randomly corrupt a string
#'
#' Each character of str is independently replaced, with probability
#' prob_char, by a random element of sub_in.
#'
#' random_edit("billy bob thorton")
random_edit <- function(str, prob_char = g_prob_error, sub_in = letters){
  n_chars <- nchar(str)
  if_else(
    # TRUE/FALSE spelled out; T/F are reassignable and unsafe
    sample(
      x = c(TRUE, FALSE),
      size = n_chars,
      prob = c(prob_char, 1 - prob_char),
      replace = TRUE
    ),
    # replace = TRUE is required in this draw too: without it sample()
    # errors whenever nchar(str) > length(sub_in), e.g. any string longer
    # than 26 characters against the default `letters`.
    sample(x = sub_in, size = n_chars, replace = TRUE),
    map_s2c(str)
  ) %>% paste0(collapse = "")
}
#'
#' Generate a base set of entities
#'
#' Each row is one "person" with names, a city, and a date of birth,
#' all drawn with realistic frequency weights. Also attaches:
#'   key - the true identity label, used later for evaluation
#'   n   - how many noisy duplicate records to generate for this person
#'
#' generate_entities(10)
generate_entities <- function(num_entities = g_num_entities, max_rep = g_num_dup_max){
  # Local helper: draw n_draw given names (first/middle), weighted by
  # historical frequency across all years. Extracted because the same
  # pipeline was previously duplicated for first_name and middle_name.
  sample_given_names <- function(n_draw){
    babynames::babynames %>%
      group_by(name) %>%
      summarise(n = sum(n, na.rm = TRUE)) %>%
      sample_n(size = n_draw, replace = TRUE, weight = n) %>%
      pull(name)
  }
  # One random calendar date per entity; only its month/day are used, so
  # day-of-year is uniform and independent of the birth year below.
  dts_for_day_of_year <- sample(
    seq(as.Date("1900-01-01"), as.Date("1901-12-31"), by = "day"),
    size = num_entities, replace = TRUE
  )
  tibble(
    first_name = sample_given_names(num_entities),
    middle_name = sample_given_names(num_entities),
    last_name =
      lexicon::freq_last_names %>%
      clean_names() %>%
      sample_n(size = num_entities, replace = TRUE, weight = prop) %>%
      pull(surname),
    city =
      maps::world.cities %>%
      clean_names() %>%
      filter(country_etc == "USA") %>%
      sample_n(size = num_entities, replace = TRUE, weight = pop) %>%
      pull(name),
    # birth years weighted by how many births babynames records per year
    dob_year =
      babynames::babynames %>%
      group_by(year) %>%
      summarise(n = sum(n, na.rm = TRUE)) %>%
      sample_n(size = num_entities, replace = TRUE, weight = n) %>%
      pull(year),
    dob_month = month(dts_for_day_of_year),
    dob_day = day(dts_for_day_of_year)
  ) %>%
    mutate(dob = ISOdate(dob_year, dob_month, dob_day)) %>%
    #select(-dob_year, -dob_month, -dob_day) %>%
    mutate(., key = seq_len(nrow(.))) %>%
    mutate(n = sample(1:max_rep, size = num_entities, replace = TRUE))
}
In this dataset every record has a name, a city, and a date of birth.
Additionally, it has a key; we will use that later to figure out which record is which.
We also generate n, which is how many times we will duplicate that record and make random edits to simulate noise.
entities <- generate_entities()
entities %>% sample_n(7)
# A tibble: 7 x 10
first_name middle_name last_name city dob_year dob_month dob_day
<chr> <chr> <chr> <chr> <dbl> <dbl> <int>
1 Patrick Veva Mcdonald New York 1959 10 26
2 Gemma Ricki Padilla Houston 1989 6 10
3 Joshua Kenneth Anderson Los Ang~ 2016 12 21
4 Alyssa Lillian James San Jose 1991 6 24
5 Sherri Jaret Jackson Tucson 1943 7 29
6 Charles Roger Jensen Los Ang~ 1950 10 17
7 Thiago Lindsay Johnson Charles~ 2007 5 7
# ... with 3 more variables: dob <dttm>, key <int>, n <int>
Make the data noisy by making random edits to 0.05 of the characters and setting 0.05 of random cells to NA. Notice how the names clearly don’t look as clean.
# Simulate messy data entry: expand each entity into n duplicate records,
# introduce character-level typos in every field, then blank out roughly
# g_prob_miss of the cells.
noisy <-
  entities %>%
  uncount(weights = n) %>%
  #select(-key) %>%
  #head(5000) %>%
  # random character edits in the string fields
  mutate(first_name = purrr::map(first_name, random_edit) %>% unlist()) %>%
  mutate(middle_name = purrr::map(middle_name, random_edit) %>% unlist()) %>%
  mutate(last_name = purrr::map(last_name, random_edit) %>% unlist()) %>%
  mutate(city = purrr::map(city, random_edit) %>% unlist()) %>%
  # date parts are edited as digit strings, then rebuilt into a datetime
  mutate(dob_year = purrr::map(dob_year, random_edit, sub_in = as.character(0:9)) %>% unlist() %>% as.integer()) %>%
  mutate(dob_month = purrr::map(dob_month, random_edit, sub_in = as.character(1:12)) %>% unlist() %>% as.integer()) %>%
  mutate(dob_day = purrr::map(dob_day, random_edit, sub_in = as.character(1:31)) %>% unlist() %>% as.integer()) %>%
  mutate(dob = ISOdate(dob_year, dob_month, dob_day)) %>%
  select(-dob_year, -dob_month, -dob_day) %>%
  # knock out ~g_prob_miss of cells; TRUE/FALSE spelled out (T/F are
  # reassignable and unsafe)
  mutate(., first_name = if_else(sample(x = c(TRUE, FALSE), size = nrow(.), replace = TRUE, prob = c(g_prob_miss, 1 - g_prob_miss)),
                                 as.character(NA), first_name)) %>%
  mutate(., last_name = if_else(sample(x = c(TRUE, FALSE), size = nrow(.), replace = TRUE, prob = c(g_prob_miss, 1 - g_prob_miss)),
                                as.character(NA), last_name)) %>%
  mutate(., city = if_else(sample(x = c(TRUE, FALSE), size = nrow(.), replace = TRUE, prob = c(g_prob_miss, 1 - g_prob_miss)),
                           as.character(NA), city)) %>%
  # ISOdate(99, 99, 99) is an invalid date -- presumably used as the
  # "missing" sentinel so the column stays POSIXct; TODO confirm it
  # evaluates to NA on this platform
  mutate(., dob = if_else(sample(x = c(TRUE, FALSE), size = nrow(.), replace = TRUE, prob = c(g_prob_miss, 1 - g_prob_miss)),
                          ISOdate(99, 99, 99), dob))
noisy %>% sample_n(7)
# A tibble: 7 x 6
first_name middle_name last_name city dob key
<chr> <chr> <chr> <chr> <dttm> <int>
1 Darlene Doris Jawssen New Brau~ 1950-06-15 12:00:00 68
2 Carolyn Samuel Agustin Newton 1943-06-06 12:00:00 6
3 Jeremy Hayden Daniels Schenect~ 1952-11-04 12:00:00 34
4 Pclly Mckexzie randy Riverside 1970-03-01 12:00:00 9
5 Sherki Jarkt eackson Tucson 1943-07-29 12:00:00 31
6 Alyssa Lillian James San Jose 1991-06-24 12:00:00 70
7 Jamie Ariana Wriyht <NA> 1998-01-04 12:00:00 16
key_of_interest <-
noisy %>%
count(key, sort = T) %>%
slice(as.integer(g_num_entities/2)) %>%
pull(key)
noisy %>%
filter(key == key_of_interest)
# A tibble: 6 x 6
first_name middle_name last_name city dob key
<chr> <chr> <chr> <chr> <dttm> <int>
1 Txtry vanzel Waluh Jackoonv~ 1974-02-05 12:00:00 32
2 Terry Daniel Waxsh Jacksonv~ 1974-02-05 12:00:00 32
3 qerry Daniel Walsh Jadzsmnv~ 1974-02-05 12:00:00 32
4 Terry faniel Walsk Japzsonv~ 1974-02-05 12:00:00 32
5 Terry Daninl Walsh Jacksonv~ 1974-03-05 12:00:00 32
6 Txrry Danieg <NA> Jacksonv~ 1974-02-05 12:00:00 32
Each column will give us some indication that it is the same person, but each column by itself will not be enough to tell us, so we will look at multiple columns. There are various metrics we can look at.
# Add soundex-encoded copies of the name/city columns so that phonetically
# similar (but differently spelled) values can still agree exactly.
# across()/rename_with() replace the superseded mutate_all()/rename_all().
noisy <-
  noisy %>%
  select(-key, -dob) %>%
  mutate(across(everything(), phonics::soundex)) %>%
  rename_with(~ paste0(.x, "_soundex")) %>%
  bind_cols(noisy, .)
noisy %>% sample_n(7)
# A tibble: 7 x 10
first_name middle_name last_name city dob key
<chr> <chr> <chr> <chr> <dttm> <int>
1 Patricia Serena Dunn Saint Pe~ 1963-08-13 12:00:00 65
2 Tom Caitlin Brooks Enid 1951-03-04 12:00:00 11
3 Hattie Cameron Regmlado Fairfield 2007-10-11 12:00:00 2
4 Raymond Margaret Mason Huntingt~ 1620-07-30 12:00:00 30
5 mubrles Royer Jensen Los Ange~ 1950-10-17 12:00:00 20
6 Jonathan nary Caron Houston 2008-12-16 12:00:00 47
7 Carolyn Samuwl Agustin Newton 1943-06-06 12:00:00 6
# ... with 4 more variables: first_name_soundex <chr>,
# middle_name_soundex <chr>, last_name_soundex <chr>,
# city_soundex <chr>
#columns to do comparisons on
cols_4_simmilarity <- noisy %>% select(-key) %>% colnames()
cols_4_simmilarity
[1] "first_name" "middle_name" "last_name"
[4] "city" "dob" "first_name_soundex"
[7] "middle_name_soundex" "last_name_soundex" "city_soundex"
Blocking is how you limit the computational complexity of what you are doing, here I am blocking on any one variable being identical, which seems reasonable as every column could be wrong with some small probability.
#
# Blocking: candidate pairs that agree exactly on at least one column
# (any single identical cell qualifies the pair)
#
p1 <-
  noisy %>%
  select(-key) %>%
  colnames() %>%
  map_dfr(~ pair_blocking(noisy, noisy, blocking_var = .x, large = FALSE))
#
# pairs with no blocking at all (this is much more expensive)
#
pa <- pair_blocking(x = noisy, y = noisy, large = FALSE)
# NOTE(review): despite the earlier plan, the analysis below proceeds with
# the unblocked pairs `pa`, not the blocked `p1` -- feasible at this size.
p <- pa # p1
p
Simple blocking
No blocking used.
First data set: 429 records
Second data set: 429 records
Total number of pairs: 184 041 pairs
Showing first 20 pairs:
x y
1 1 1
2 2 1
3 3 1
4 4 1
5 5 1
6 6 1
7 7 1
8 8 1
9 9 1
10 10 1
11 11 1
12 12 1
13 13 1
14 14 1
15 15 1
16 16 1
17 17 1
18 18 1
19 19 1
20 20 1
Above, p1 has 46271 rows, while pa has 184041 rows. This is on 429 records and 72 people, and this discrepancy gets larger as our number of records grows. Here I am using pa as the set of potential pairs.
#'
#' Custom date comparison function factory
#'
#' Returns a reclin-style comparator for one part of a date:
#'   "m" -> TRUE when the months are equal
#'   "d" -> TRUE when the days are equal
#'   "y" -> absolute difference in years
#'   otherwise -> 0
#'
#' @param date_part one of "m", "d", "y"
#' @param threshold cutoff used by the single-argument (selection) form;
#'   previously this name was referenced but never defined, so that
#'   branch always errored.
date_comp <- function(date_part, threshold = 0){
  function(x, y) {
    if (!missing(y)) {
      if (date_part == "m"){
        month(x) == month(y)
      } else if (date_part == "d"){
        day(x) == day(y)
      } else if (date_part == "y"){
        # NOTE(review): this returns a distance (0 = same year), unlike
        # the logical equality returned for "m"/"d" -- confirm intended.
        abs(year(x) - year(y))
      } else {
        0
      }
    }
    else {
      # Single-argument form: x holds previously computed scores; keep
      # non-missing scores above threshold.
      (x > threshold) & !is.na(x)
    }
  }
}
#date_comp
We now have pairs to compare; we look at how close individual cells in matching pairs are to each other. To do this we generate some comparisons. Here we generate 4 separate ways of comparing the strings across 3 different models. All the models will share soundex as a potential matching method; then each model will specialize in either Jaro-Winkler, Longest Common Substring, or Jaccard similarity.
#########################
# soundex columns should be identical: reclin's identical() comparator
# factory yields an exact-equality check on the encoded values
func_list_by_col <-
  list(first_name_soundex = identical(),
       middle_name_soundex = identical(),
       last_name_soundex = identical(),
       city_soundex = identical())
########################
# The library does not seem to support multiple comparators on the same
# column at the same time, so we fit one model per string-similarity
# function as the next best thing
default_comparator_list <-
  list(jw = jaro_winkler(),
       lcs = lcs(),
       jaccard = jaccard()
  )
############################
# generate similarities for all the candidate pairs: one compared-pairs
# object per default comparator, all sharing the soundex comparators above
p_s <-
  default_comparator_list %>%
  map(function(func){
    compare_pairs(pairs = p,
                  by = cols_4_simmilarity,
                  comparators = func_list_by_col,
                  default_comparator = func)
  })
# carry the model names ("jw", "lcs", "jaccard") onto the result list
names(p_s) <- names(default_comparator_list)
Here we sample from the pairs with similarities p_s to see that we now have many numerical values indicating how close each x y pair is along various metrics, higher numbers in general mean more likely to be a match as does TRUE in the binary columns.
ps_combined <- p_s %>% bind_rows(.id = 'model_name')
ps_combined %>% sample_n(5) %>% as_tibble()
# A tibble: 5 x 12
model_name x y first_name middle_name last_name city dob
<chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 jw 323 104 0.483 0.421 NA 0.519 0.826
2 jaccard 77 296 0 0 0 0 NA
3 jw 130 115 0.442 0.556 0.625 0.448 0.782
4 lcs 399 377 0.333 0.75 NA 0.308 NA
5 jw 281 28 0 0.448 0 0.423 0.815
# ... with 4 more variables: first_name_soundex <lgl>,
# middle_name_soundex <lgl>, last_name_soundex <lgl>,
# city_soundex <lgl>
From here we try to go from potential pairs to true pairs. The easiest thing to do is sum across all the numeric columns, and say that a higher number is more likely to be a match.
###############################
# add similarities
p_s <-
p_s %>%
map(score_simsum, var = "simpl_sum")
ps_combined <- p_s %>% bind_rows(.id = 'model_name')
ps_combined %>%
ggplot(aes(x = simpl_sum, color = model_name)) +
geom_density() +
labs(color = "Model", x = "Total Simmilarity Score", y = "Density", title = "Density of total scores of potential pairs by model")
# default_comparator_list <-
# list(jw = jaro_winkler(),
# lcs = lcs(),
# jaccard = jaccard()
# )
# p_s <-
# default_comparator_list %>%
# map(function(func){
# compare_pairs(pairs = p,
# by = cols_4_simmilarity,
# comparators = func_list_by_col,
# default_comparator = func)
# })
# names(p_s) <- names(default_comparator_list)
The EM algorithm from the 2007 book Data Quality and Record Linkage Techniques generates the probability that each column contains information about whether the record pair is or is not a match.
em_s <-
p_s %>%
map(problink_em)
em_s
$jw
M- and u-probabilities estimated by the EM-algorithm:
Variable M-probability U-probability
first_name 0.4261385 1.239424e-09
middle_name 0.6986917 4.067077e-09
last_name 0.4094873 9.044788e-16
city 0.4096737 3.182302e-03
dob 0.4764895 1.187817e-03
first_name_soundex 0.4841342 4.286314e-03
middle_name_soundex 0.7627071 8.458971e-04
last_name_soundex 0.4743486 9.135294e-04
city_soundex 0.3774117 5.705523e-04
Matching probability: 0.02480617.
$lcs
M- and u-probabilities estimated by the EM-algorithm:
Variable M-probability U-probability
first_name 0.6758350 6.074621e-06
middle_name 0.7881897 7.229890e-03
last_name 0.7160520 2.463114e-14
city 0.7615701 7.337231e-03
dob 0.7425143 5.190125e-03
first_name_soundex 0.6031999 4.284000e-03
middle_name_soundex 0.7067278 5.810422e-03
last_name_soundex 0.5981588 7.827820e-04
city_soundex 0.4508693 9.752591e-04
Matching probability: 0.01989889.
$jaccard
M- and u-probabilities estimated by the EM-algorithm:
Variable M-probability U-probability
first_name 0.4363094 1.486481e-09
middle_name 0.6933837 1.558471e-08
last_name 0.4192609 7.779157e-16
city 0.4171563 3.170574e-03
dob 0.4705465 5.812155e-04
first_name_soundex 0.4952393 4.294945e-03
middle_name_soundex 0.7576207 1.423543e-03
last_name_soundex 0.4852706 9.229072e-04
city_soundex 0.3868390 5.598046e-04
Matching probability: 0.02422578.
p_s <-
p_s %>%
map(function(p_i){
score_problink(p_i, var = "em_weight")
})
ps_combined <- p_s %>% bind_rows(.id = 'model_name')
ps_combined %>%
ggplot(aes(x = em_weight, color = model_name)) +
geom_density() +
labs(color = "Model", x = "EM Weight Score", y = "Density", title = "Density of total scores of potential pairs by model")
Generate results; the is_same column tells us if the two records are truly from the same person.
p_results <-
map2_dfr(names(p_s), p_s, function(nm, p){
p$def_func = nm
p %>% as_tibble()
})
noisy_kr <-
noisy %>%
mutate(., row = 1:nrow(.)) %>%
select("key", "row")
p_results <-
p_results %>%
left_join(noisy_kr %>% set_names(c("key_x", "x")), by = "x") %>%
left_join(noisy_kr %>% set_names(c("key_y", "y")), by = "y") %>%
mutate(is_same = key_x == key_y)
p_results %>% summary()
x y first_name middle_name
Min. : 1 Min. : 1 Min. :0.00 Min. :0.0000
1st Qu.:108 1st Qu.:108 1st Qu.:0.00 1st Qu.:0.0000
Median :215 Median :215 Median :0.15 Median :0.1667
Mean :215 Mean :215 Mean :0.21 Mean :0.2191
3rd Qu.:322 3rd Qu.:322 3rd Qu.:0.43 3rd Qu.:0.4365
Max. :429 Max. :429 Max. :1.00 Max. :1.0000
NA's :60048
last_name city dob first_name_soundex
Min. :0.00 Min. :0.00 Min. :0.11 Mode :logical
1st Qu.:0.00 1st Qu.:0.00 1st Qu.:0.40 FALSE:483138
Median :0.15 Median :0.18 Median :0.56 TRUE :8937
Mean :0.21 Mean :0.23 Mean :0.58 NA's :60048
3rd Qu.:0.43 3rd Qu.:0.42 3rd Qu.:0.78
Max. :1.00 Max. :1.00 Max. :1.00
NA's :55176 NA's :55176 NA's :109755
middle_name_soundex last_name_soundex city_soundex
Mode :logical Mode :logical Mode :logical
FALSE:541224 FALSE:489960 FALSE:245088
TRUE :10899 TRUE :6987 TRUE :5475
NA's :55176 NA's :301560
simpl_sum em_weight def_func
Min. :0.0000 Min. : -9.408 Length:552123
1st Qu.:0.4913 1st Qu.: -5.990 Class :character
Median :1.2121 Median : -5.070 Mode :character
Mean :1.3285 Mean : -3.373
3rd Qu.:1.8183 3rd Qu.: -3.085
Max. :9.0000 Max. :107.512
key_x key_y is_same
Min. : 1.00 Min. : 1.00 Mode :logical
1st Qu.:16.00 1st Qu.:16.00 FALSE:541530
Median :35.00 Median :35.00 TRUE :10593
Mean :34.66 Mean :34.66
3rd Qu.:51.00 3rd Qu.:51.00
Max. :72.00 Max. :72.00
p_results
# A tibble: 552,123 x 17
x y first_name middle_name last_name city dob
<int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 1 1 1
2 2 1 1 0.778 0.867 0.917 NA
3 3 1 1 0.889 0.867 1 1
4 4 1 1 0.889 0.867 1 1
5 5 1 1 0.778 0.867 1 0.919
6 6 1 1 0.889 0.867 1 1
7 7 1 0.889 0.889 0.867 0.917 0.965
8 8 1 0.889 0.889 0.867 0.833 1
9 9 1 1 0.889 0.733 0.917 1
10 10 1 1 0.889 0.867 1 1
# ... with 552,113 more rows, and 10 more variables:
# first_name_soundex <lgl>, middle_name_soundex <lgl>,
# last_name_soundex <lgl>, city_soundex <lgl>, simpl_sum <dbl>,
# em_weight <dbl>, def_func <chr>, key_x <int>, key_y <int>,
# is_same <lgl>
p_results %>%
select(-x, -y) %>%
mutate_if(is.logical, as.double) %>%
pivot_longer(!def_func) %>%
mutate(def_func = if_else(grepl("soundex",name), "soundex", def_func)) %>%
mutate(name = if_else(grepl("soundex",name), gsub("_soundex", "", name), name)) %>%
filter(!name %in% c('key_x','key_y','is_same')) %>%
#mutate(name = snakecase::to_title_case(name)) %>%
ggplot(aes(x = value, color = def_func)) +
geom_density() +
# facet_grid(cols = vars(name), rows = vars(def_func), scales = "free")
facet_wrap(~ name, scales = "free") +
labs(color = "model", title = "similarity metrics distribution across each column and each model")
p_results %>%
select(simpl_sum, em_weight, def_func, is_same) %>%
pivot_longer(., cols=c("simpl_sum", "em_weight")) %>%
mutate(mdl_nm = paste0(def_func, "_", name ))
# A tibble: 1,104,246 x 5
def_func is_same name value mdl_nm
<chr> <lgl> <chr> <dbl> <chr>
1 jw TRUE simpl_sum 8 jw_simpl_sum
2 jw TRUE em_weight 101. jw_em_weight
3 jw TRUE simpl_sum 4.56 jw_simpl_sum
4 jw TRUE em_weight 26.1 jw_em_weight
5 jw TRUE simpl_sum 6.76 jw_simpl_sum
6 jw TRUE em_weight 44.3 jw_em_weight
7 jw TRUE simpl_sum 6.76 jw_simpl_sum
8 jw TRUE em_weight 44.3 jw_em_weight
9 jw TRUE simpl_sum 5.56 jw_simpl_sum
10 jw TRUE em_weight 31.1 jw_em_weight
# ... with 1,104,236 more rows
Looking at the model results from the ROC perspective I can say that clearly I did not make this problem hard enough, and, well, I am evaluating on my training set. Moving on.
p_results %>%
select(simpl_sum, em_weight, def_func, is_same) %>%
pivot_longer(., cols=c("simpl_sum", "em_weight")) %>%
mutate(mdl_nm = paste0(def_func, "_", name )) %>%
group_by(mdl_nm) %>%
mutate(auc = AUC::auc(AUC::roc(value , as.factor(is_same)))) %>%
ungroup() %>%
mutate(mdl_nm = paste0("(", round(auc, 5), ") ", mdl_nm )) %>%
mutate(mdl_nm = fct_reorder(mdl_nm, auc)) %>%
ggplot(aes(d = is_same, m = value, color = mdl_nm)) +
geom_roc(n.cuts = 10) +
style_roc() +
labs()
###Select Best Model
p_results %>%
select(simpl_sum, em_weight, def_func, is_same) %>%
pivot_longer(., cols=c("simpl_sum", "em_weight")) %>%
mutate(mdl_nm = paste0(def_func, "_", name )) %>%
group_by(mdl_nm) %>%
mutate(auc = AUC::auc(AUC::roc(value , as.factor(is_same)))) %>%
ungroup() %>%
mutate(mdl_nm = paste0("(", auc, ") ", mdl_nm )) %>%
mutate(mdl_nm = fct_reorder(mdl_nm, auc)) %>%
filter(auc == max(auc)) %>%
ggplot(aes(d = is_same, m = value, color = mdl_nm)) +
geom_roc(n.cuts = 20) +
style_roc() +
labs()
best_model_results <-
p_results %>%
select(simpl_sum, em_weight, def_func, is_same, x, y) %>%
pivot_longer(., cols=c("simpl_sum", "em_weight")) %>%
mutate(mdl_nm = paste0(def_func, "_", name )) %>%
group_by(mdl_nm) %>%
mutate(auc = AUC::auc(AUC::roc(value , as.factor(is_same)))) %>%
ungroup() %>%
filter(auc == max(auc))
best_model_results
# A tibble: 184,041 x 8
def_func is_same x y name value mdl_nm auc
<chr> <lgl> <int> <int> <chr> <dbl> <chr> <dbl>
1 lcs TRUE 1 1 em_weight 73.3 lcs_em_weight 1.00
2 lcs TRUE 2 1 em_weight 17.8 lcs_em_weight 1.00
3 lcs TRUE 3 1 em_weight 32.6 lcs_em_weight 1.00
4 lcs TRUE 4 1 em_weight 32.6 lcs_em_weight 1.00
5 lcs TRUE 5 1 em_weight 22.7 lcs_em_weight 1.00
6 lcs TRUE 6 1 em_weight 32.6 lcs_em_weight 1.00
7 lcs TRUE 7 1 em_weight 10.4 lcs_em_weight 1.00
8 lcs TRUE 8 1 em_weight 12.7 lcs_em_weight 1.00
9 lcs TRUE 9 1 em_weight 28.8 lcs_em_weight 1.00
10 lcs TRUE 10 1 em_weight 32.6 lcs_em_weight 1.00
# ... with 184,031 more rows
best_model_results %>%
mutate(value_cut = cut_quantiles(value)) %>%
group_by(value_cut, is_same) %>%
summarise(n = n()) %>% ungroup() %>%
group_by(value_cut) %>% mutate(f = n/sum(n)) %>% ungroup() %>%
ggplot(aes(x = value_cut, y = f, fill = is_same)) + geom_col() + coord_flip()
best_model_results %>%
arrange(value) %>%
mutate(f_cumsum_true=cumsum(is_same)/sum(is_same)) %>%
mutate(f_cumsum_false=cumsum(!is_same)/sum(!is_same)) %>%
rename(score:=value,
score_typ:=name) %>%
pivot_longer(cols = c(f_cumsum_true, f_cumsum_false)) %>%
ggplot(aes(x=score, y=value, color= name)) + geom_line()
Threshold <-
best_model_results %>%
arrange(value) %>%
mutate(f_cumsum_true=cumsum(is_same)/sum(is_same)) %>%
mutate(f_cumsum_false=cumsum(!is_same)/sum(!is_same)) %>%
arrange(abs(f_cumsum_false - 0.9995)) %>%
slice(1) %>% pull(value)
Threshold
[1] 8.101735
Threshold2 <-
best_model_results %>%
arrange(value) %>%
mutate(f_cumsum_true=cumsum(is_same)/sum(is_same)) %>%
mutate(f_cumsum_false=cumsum(!is_same)/sum(!is_same)) %>%
arrange(abs(f_cumsum_true - 0.75)) %>%
slice(1) %>% pull(value)
Threshold2
[1] 65.64459
Threshold
[1] 8.101735
g <- make_empty_graph(n = 0, directed=FALSE) +
vertices(paste0("N_",1:nrow(noisy)))
edges_vector <-
best_model_results %>%
filter(value >= Threshold) %>%
#filter(x != y) %>%
select(x, y) %>% #head(5) %>%
pmap(function(x, y){
c(x,y)
}) %>% unlist()
#edges_vector
gorder(g) %>% print()
[1] 429
[1] 0
#g
[1] 429
[1] 3517
g
IGRAPH 2ee511f UN-- 429 3517 --
+ attr: name (v/c)
+ edges from 2ee511f (vertex names):
[1] N_1--N_1 N_1--N_2 N_1--N_3 N_1--N_4 N_1--N_5 N_1--N_6
[7] N_1--N_7 N_1--N_8 N_1--N_9 N_1--N_10 N_1--N_11 N_1--N_2
[13] N_2--N_2 N_2--N_3 N_2--N_4 N_2--N_5 N_2--N_6 N_2--N_7
[19] N_2--N_8 N_2--N_9 N_2--N_10 N_2--N_11 N_1--N_3 N_2--N_3
[25] N_3--N_3 N_3--N_4 N_3--N_5 N_3--N_6 N_3--N_7 N_3--N_8
[31] N_3--N_9 N_3--N_10 N_3--N_11 N_1--N_4 N_2--N_4 N_3--N_4
[37] N_4--N_4 N_4--N_5 N_4--N_6 N_4--N_7 N_4--N_8 N_4--N_9
[43] N_4--N_10 N_4--N_11 N_1--N_5 N_2--N_5 N_3--N_5 N_4--N_5
+ ... omitted several edges
g_s <- decompose.graph(g)
gi <-
g_s %>%
sample(x = ., size=1) %>%
extract2(1)
print(length(g_s))
[1] 67
print(gi)
IGRAPH 2f38396 UN-- 7 56 --
+ attr: name (v/c)
+ edges from 2f38396 (vertex names):
[1] N_100--N_100 N_100--N_100 N_100--N_101 N_100--N_101 N_101--N_101
[6] N_101--N_101 N_100--N_102 N_100--N_102 N_101--N_102 N_101--N_102
[11] N_102--N_102 N_102--N_102 N_100--N_103 N_100--N_103 N_101--N_103
[16] N_101--N_103 N_102--N_103 N_102--N_103 N_103--N_103 N_103--N_103
[21] N_100--N_104 N_100--N_104 N_101--N_104 N_101--N_104 N_102--N_104
[26] N_102--N_104 N_103--N_104 N_103--N_104 N_104--N_104 N_104--N_104
[31] N_100--N_105 N_100--N_105 N_101--N_105 N_101--N_105 N_102--N_105
[36] N_102--N_105 N_103--N_105 N_103--N_105 N_104--N_105 N_104--N_105
+ ... omitted several edges
# Map each connected component ("predicted person") of g to its member
# records: g = component label, n_nm = vertex name, n = record row number.
# Built with map_dfr instead of the previous nested loops, which grew the
# tibble one row at a time (copying it on every append).
gwithn <-
  map_dfr(seq_along(g_s), function(i) {
    node_names <- as_ids(V(g_s[[i]]))
    tibble(
      g = paste0("g_", i),
      n_nm = node_names,
      n = as.integer(gsub(x = node_names, pattern = "N_", replacement = ""))
    )
  })
gwithn <- gwithn %>% distinct()
gwithn
# A tibble: 429 x 3
g n_nm n
<chr> <chr> <int>
1 g_1 N_1 1
2 g_1 N_2 2
3 g_1 N_3 3
4 g_1 N_4 4
5 g_1 N_5 5
6 g_1 N_6 6
7 g_1 N_7 7
8 g_1 N_8 8
9 g_1 N_9 9
10 g_1 N_10 10
# ... with 419 more rows
Above, g is a group of records that is one person and n_nm is a record. For the fully disconnected graph we have predicted there to be 67 distinct people in 429 records, when there are in fact nrow(entities) people in the original dataset.
# Same mapping, but using fast-greedy community detection instead of
# connected components. map_dfr replaces the previous walk()/<<- pattern,
# which mutated a global and grew the tibble one row at a time.
g_c <- fastgreedy.community(simplify(as.undirected(g)))
gwithn <-
  map_dfr(seq_along(g_c), function(i_c) {
    node_names <- g_c[[i_c]]
    tibble(
      g = paste0("g_", i_c),
      n_nm = node_names,
      n = as.integer(gsub(x = node_names, pattern = "N_", replacement = ""))
    )
  })
gwithn
# A tibble: 429 x 3
g n_nm n
<chr> <chr> <int>
1 g_1 N_36 36
2 g_1 N_37 37
3 g_1 N_38 38
4 g_1 N_39 39
5 g_1 N_40 40
6 g_1 N_41 41
7 g_1 N_42 42
8 g_1 N_43 43
9 g_1 N_44 44
10 g_1 N_45 45
# ... with 419 more rows
For the community algorithm: 67 distinct people in 429 records, when there are in fact nrow(entities) people in the original dataset.
plot(gi)
plot(g)
So now we will summarize the data for later in a data frame.
# Concatenate the n most frequent values of x as "value(count)" pairs,
# e.g. "Smith(3); Smyth(1)".
#
# The cap is min(n, number of DISTINCT values): the previous
# min(n, length(x)) over-indexed the frequency table whenever there were
# fewer distinct values than n, producing bogus "NA(NA)" entries.
most_common_names_concat <- function(x, sep = "; ", n = 2){
  counts <- sort(table(as.character(x)), decreasing = TRUE)
  n2 <- min(n, length(counts))
  top <- counts[seq_len(n2)]
  paste0(paste0(names(top), "(", top, ")"), collapse = sep)
}
# Named list of the n most frequent values of x and their integer counts,
# ordered by decreasing frequency.
#
# As with most_common_names_concat, the cap is the number of DISTINCT
# values; the previous min(n, length(x)) over-indexed the table and
# produced NA-named NA entries. The tibble round-trip is replaced by a
# direct names/values extraction from the table.
most_common_names_vector <- function(x, n = 2){
  counts <- sort(table(x), decreasing = TRUE)
  n2 <- min(n, length(counts))
  top <- counts[seq_len(n2)]
  out <- as.list(as.integer(top))
  names(out) <- names(top)
  out
}
unique_list <- function(x){
  # All distinct values of x, ordered by decreasing frequency, wrapped in
  # a one-element list (so the result can live in a tibble list-column).
  counts <- sort(table(x), decreasing = TRUE)
  list(names(counts))
}
concat_vect_remove <- function(x, sep = "; "){
  # Collapse the distinct non-NA values of x into a single string.
  vals <- unique(x)
  paste0(vals[!is.na(vals)], collapse = sep)
}
count_unique <- function(x){
  # Number of distinct non-NA values in x.
  sum(!is.na(unique(x)))
}
Now we know all the misspellings of each person's names.
de_noised <-
noisy %>%
mutate(., n = 1:nrow(.)) %>%
left_join(gwithn, by = "n") %>%
group_by(g) %>%
summarise(n = n(),
first_name = unique_list(first_name),
last_name = unique_list(last_name),
city = unique_list(city),
dob = unique_list(dob),
key_n = n_distinct(key),
key = unique_list(key)#,
#key = most_common_names_concat(key)
) %>% ungroup()%>%
arrange(desc(key_n))
de_noised
# A tibble: 67 x 8
g n first_name last_name city dob key_n key
<chr> <int> <list> <list> <list> <list> <int> <list>
1 g_11 30 <chr [8]> <chr [11]> <chr [16~ <chr [8~ 4 <chr [4~
2 g_27 27 <chr [10]> <chr [10]> <chr [12~ <chr [6~ 4 <chr [4~
3 g_31 11 <chr [1]> <chr [3]> <chr [7]> <chr [4~ 2 <chr [2~
4 g_1 12 <chr [3]> <chr [8]> <chr [5]> <chr [4~ 1 <chr [1~
5 g_10 11 <chr [4]> <chr [3]> <chr [5]> <chr [1~ 1 <chr [1~
6 g_12 9 <chr [3]> <chr [4]> <chr [4]> <chr [1~ 1 <chr [1~
7 g_13 9 <chr [4]> <chr [2]> <chr [2]> <chr [1~ 1 <chr [1~
8 g_14 9 <chr [5]> <chr [4]> <chr [7]> <chr [2~ 1 <chr [1~
9 g_15 9 <chr [4]> <chr [6]> <chr [1]> <chr [4~ 1 <chr [1~
10 g_16 9 <chr [5]> <chr [1]> <chr [1]> <chr [2~ 1 <chr [1~
# ... with 57 more rows
Make a form with validation checks! … Please…
print(R.version.string)
[1] "R version 4.0.4 (2021-02-15)"
sessionInfo(package = NULL)
R version 4.0.4 (2021-02-15)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 18363)
Matrix products: default
locale:
[1] LC_COLLATE=English_Canada.1252 LC_CTYPE=English_Canada.1252
[3] LC_MONETARY=English_Canada.1252 LC_NUMERIC=C
[5] LC_TIME=English_Canada.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] snakecase_0.11.0 lemon_0.4.5 igraph_1.2.6
[4] magrittr_2.0.1 AUC_0.3.0 plotROC_2.2.1
[7] phonics_1.3.9 reclin_0.1.1 ldat_0.3.3
[10] Rcpp_1.0.6 lvec_0.2.2 lexicon_1.2.1
[13] babynames_1.0.1 fuzzyjoin_0.1.6 lubridate_1.7.10
[16] janitor_2.1.0 forcats_0.5.1 stringr_1.4.0
[19] dplyr_1.0.5 purrr_0.3.4 readr_1.4.0
[22] tidyr_1.1.3 tibble_3.1.0 ggplot2_3.3.3
[25] tidyverse_1.3.0 knitr_1.31
loaded via a namespace (and not attached):
[1] fs_1.5.0 httr_1.4.2 tools_4.0.4
[4] backports_1.2.1 bslib_0.2.4 utf8_1.2.1
[7] R6_2.5.0 DBI_1.1.1 colorspace_2.0-0
[10] withr_2.4.1 tidyselect_1.1.0 gridExtra_2.3
[13] downlit_0.2.1 compiler_4.0.4 cli_2.4.0
[16] rvest_1.0.0 xml2_1.3.2 labeling_0.4.2
[19] sass_0.3.1 scales_1.1.1 digest_0.6.27
[22] rmarkdown_2.7 stringdist_0.9.6.3 pkgconfig_2.0.3
[25] htmltools_0.5.1.1 dbplyr_2.1.0 highr_0.8
[28] maps_3.3.0 rlang_0.4.10 readxl_1.3.1
[31] rstudioapi_0.13 jquerylib_0.1.3 generics_0.1.0
[34] farver_2.1.0 jsonlite_1.7.2 distill_1.2
[37] munsell_0.5.0 fansi_0.4.2 lifecycle_1.0.0
[40] stringi_1.5.3 yaml_2.2.1 plyr_1.8.6
[43] grid_4.0.4 parallel_4.0.4 crayon_1.4.1
[46] lattice_0.20-41 haven_2.3.1 hms_1.0.0
[49] pillar_1.5.1 lpSolve_5.6.15 reprex_1.0.0
[52] glue_1.4.2 evaluate_0.14 data.table_1.14.0
[55] modelr_0.1.8 vctrs_0.3.7 cellranger_1.1.0
[58] gtable_0.3.0 assertthat_0.2.1 xfun_0.22
[61] syuzhet_1.0.6 broom_0.7.5 ellipsis_0.3.1